# Load required libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(naniar)
##
## Attaching package: 'naniar'
##
## The following object is masked from 'package:skimr':
##
## n_complete
library(ggplot2)
library(corrplot)
## corrplot 0.95 loaded
library(broom)
library(countrycode)
library(ggrepel)
# Load Raw Data from source
load_energy_data <- function(url = "https://raw.githubusercontent.com/owid/energy-data/master/owid-energy-data.csv") {
read_csv(url)
}
# Filter out unnecessary data and high missing values
filter_data <- function(df) {
df %>%
filter(!str_starts(iso_code, "OWID"), !is.na(country), !is.na(year)) %>%
filter(year >= 2000) %>%
select(where(~ sum(is.na(.)) < 0.4 * nrow(df))) %>%
filter(!is.na(population), !is.na(gdp))
}
# Select only necessary columns
select_energy_columns <- function(df) {
df %>%
select(
country, iso_code, year, population, gdp,
electricity_generation,
renewables_electricity, fossil_electricity,
solar_electricity, wind_electricity, hydro_electricity,
renewables_share_elec, coal_share_elec, gas_share_elec, oil_share_elec
)
}
# Add normalized metrics : based on population and GDP
add_normalized_metrics <- function(df) {
df %>%
mutate(
electricity_per_capita = electricity_generation / population,
renewables_per_capita = renewables_electricity / population,
fossil_per_capita = fossil_electricity / population,
electricity_per_gdp = electricity_generation / gdp,
gdp_per_electricity = gdp / (electricity_generation + 1)
)
}
# Log Transforms - Highly skewed variables like population, electricity generation
add_log_transforms <- function(df) {
df %>%
mutate(
log_gdp = log(gdp + 1),
log_population = log(population + 1),
log_electricity = log(electricity_generation + 1)
)
}
# Check missing percentage
na_percentage <- function(df) {
sapply(df, function(col) round(mean(is.na(col)) * 100, 2))
}
# Add energy ratios - gives a perspective of percentages
add_energy_ratios <- function(df) {
df %>%
mutate(
fossil_to_renewable_ratio = fossil_electricity / (renewables_electricity + 1),
fossil_share_elec = fossil_electricity / (electricity_generation + 1),
solar_share = solar_electricity / (renewables_electricity + 1),
wind_share = wind_electricity / (renewables_electricity + 1),
hydro_share = hydro_electricity / (renewables_electricity + 1)
)
}
add_classification_flags <- function(df) {
df %>%
mutate(
high_renewable = if_else(renewables_share_elec > 50, 1, 0),
transitioning = if_else(renewables_share_elec > fossil_share_elec, 1, 0)
)
}
transform_energy_data <- function(df) {
df %>%
add_normalized_metrics() %>%
add_log_transforms() %>%
add_energy_ratios() %>%
add_classification_flags()
}
# Load and process
energy_raw <- load_energy_data()
## Rows: 21812 Columns: 130
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): country, iso_code
## dbl (128): year, population, gdp, biofuel_cons_change_pct, biofuel_cons_chan...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#write_csv(energy_raw, "energy_raw.csv")
cat("Number of rows in Raw Dataset :", nrow(energy_raw),'\n')
## Number of rows in Raw Dataset : 21812
cat("Number of columns in Raw Dataset :", ncol(energy_raw),'\n')
## Number of columns in Raw Dataset : 130
energy_clean <- filter_data(energy_raw)
cat("Number of rows in Filtered Dataset :", nrow(energy_clean),'\n')
## Number of rows in Filtered Dataset : 3795
cat("Number of columns in Filtered Dataset :", ncol(energy_clean),'\n')
## Number of columns in Filtered Dataset : 130
energy_selected <- select_energy_columns(energy_clean)
cat("Number of columns in Dataset after dropping unnecessary columns:", ncol(energy_selected),'\n')
## Number of columns in Dataset after dropping unnecessary columns: 15
energy_transformed <- transform_energy_data(energy_selected)
# Add continent info
energy_transformed <- energy_transformed %>%
mutate(continent = countrycode(country, "country.name", "continent"))
cat("Number of columns in Dataset after creating new columns and transformations:", ncol(energy_transformed),'\n')
## Number of columns in Dataset after creating new columns and transformations: 31
### Checking for duplicated and % of missing values
energy_transformed %>% filter(duplicated(.))
### Checking for missing values
print("Percentage of Missing Values left :")
## [1] "Percentage of Missing Values left :"
na_percentage(energy_transformed)
## country iso_code year
## 0.00 0.00 0.00
## population gdp electricity_generation
## 0.00 0.00 0.00
## renewables_electricity fossil_electricity solar_electricity
## 0.00 0.00 0.13
## wind_electricity hydro_electricity renewables_share_elec
## 0.00 1.92 0.13
## coal_share_elec gas_share_elec oil_share_elec
## 0.13 0.71 0.13
## electricity_per_capita renewables_per_capita fossil_per_capita
## 0.00 0.00 0.00
## electricity_per_gdp gdp_per_electricity log_gdp
## 0.00 0.00 0.00
## log_population log_electricity fossil_to_renewable_ratio
## 0.00 0.00 0.00
## fossil_share_elec solar_share wind_share
## 0.00 0.13 0.00
## hydro_share high_renewable transitioning
## 1.92 0.13 0.13
## continent
## 0.00
#write_csv(energy_transformed, "energy_transformed.csv")
# Filter for latest year
latest_year <- max(energy_transformed$year, na.rm = TRUE)
latest_data <- energy_transformed %>% filter(year == latest_year)
# Get top countries for different metrics
top_renewables <- latest_data %>% arrange(desc(renewables_electricity)) %>% slice(1)
top_generation <- latest_data %>% arrange(desc(electricity_generation)) %>% slice(1)
top_per_capita <- latest_data %>% arrange(desc(electricity_per_capita)) %>% slice(1)
top_gdp <- latest_data %>% arrange(desc(gdp)) %>% slice(1)
top_renew_share <- latest_data %>% arrange(desc(renewables_share_elec)) %>% slice(1)
# Print the results
cat(glue::glue("Year considered: {latest_year}\n"))
## Year considered: 2022
cat(glue::glue(" Country with most renewable electricity: {top_renewables$country} ({round(top_renewables$renewables_electricity, 2)} TWh)\n"))
## Country with most renewable electricity: China (2670.18 TWh)
cat(glue::glue(" Country with highest electricity generation: {top_generation$country} ({round(top_generation$electricity_generation, 2)} TWh)\n"))
## Country with highest electricity generation: China (8848.73 TWh)
cat(glue::glue(" Country with highest GDP: {top_gdp$country} (${format(round(top_gdp$gdp, 0), big.mark=',')})\n"))
## Country with highest GDP: China ($2.696602e+13)
cat(glue::glue(" Country with highest renewables share: {top_renew_share$country} ({round(top_renew_share$renewables_share_elec, 2)}%)\n"))
## Country with highest renewables share: Albania (100%)
co2_data <- read_csv("https://raw.githubusercontent.com/owid/co2-data/master/owid-co2-data.csv")
## Rows: 50191 Columns: 79
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): country, iso_code
## dbl (77): year, population, gdp, cement_co2, cement_co2_per_capita, co2, co2...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#write_csv(co2_data, "co2_raw.csv")
# Filter data for required years and missing values
co2_data = filter_data(co2_data)
# Select necessary columns
co2_data = co2_data %>% select(-c(cement_co2,cement_co2_per_capita,co2_growth_abs,co2_growth_prct,co2_including_luc_growth_abs,co2_including_luc_growth_prct,cumulative_cement_co2,cumulative_co2_including_luc,cumulative_luc_co2,flaring_co2,flaring_co2_per_capita,cumulative_flaring_co2,share_global_flaring_co2,share_global_cumulative_flaring_co2))
#COâ‚‚ emissions year-over-year change
co2_data = co2_data %>%
group_by(country) %>%
mutate(co2_pct_change = (co2 - lag(co2)) / lag(co2) * 100)
#Total COâ‚‚ from fossil fuel types
co2_data = co2_data %>%
mutate(fossil_fuel_co2 = coal_co2 + oil_co2 + gas_co2)
#log gdp
co2_data = co2_data %>% mutate(log_gdp = log(gdp))
#log co2
co2_data = co2_data %>% mutate(log_co2 = log(co2))
# write to csv
#write_csv(co2_transformed, "co2_transformed.csv")
co2_transformed <- co2_data
# Repeated column names in both datasets
co2_transformed <- co2_transformed %>%
select(-iso_code, -population, -gdp, -log_gdp)
#
energy_co2_merged <- left_join(
energy_transformed,
co2_transformed,
by = c("country", "year")
)
# Variable creation after merging datasets
get_co2_electricity_ratio <- function(df) {
df %>%
mutate(
co2_per_kwh = co2 / electricity_generation)
}
energy_co2_merged <- get_co2_electricity_ratio(energy_co2_merged)
# Save dataset
#write_csv(energy_co2_merged, "/Users/manasamangipudi/Desktop/Semester-3/DataWrangling/Project/data/energy_co2_data_merged.csv")
head(energy_co2_merged)
summary(energy_co2_merged)
## country iso_code year population
## Length:3795 Length:3795 Min. :2000 Min. :6.817e+04
## Class :character Class :character 1st Qu.:2005 1st Qu.:3.817e+06
## Mode :character Mode :character Median :2011 Median :1.004e+07
## Mean :2011 Mean :4.238e+07
## 3rd Qu.:2017 3rd Qu.:2.929e+07
## Max. :2022 Max. :1.426e+09
##
## gdp electricity_generation renewables_electricity
## Min. :3.129e+08 Min. : 0.00 Min. : 0.00
## 1st Qu.:2.354e+10 1st Qu.: 3.11 1st Qu.: 0.26
## Median :7.689e+10 Median : 15.37 Median : 3.09
## Mean :5.690e+11 Mean : 131.72 Mean : 29.21
## 3rd Qu.:3.505e+11 3rd Qu.: 66.06 3rd Qu.: 13.86
## Max. :2.697e+13 Max. :8848.73 Max. :2670.18
##
## fossil_electricity solar_electricity wind_electricity hydro_electricity
## Min. : 0.00 Min. : 0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.71 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.11
## Median : 5.96 Median : 0.000 Median : 0.000 Median : 2.13
## Mean : 86.56 Mean : 1.613 Mean : 3.947 Mean : 21.45
## 3rd Qu.: 38.19 3rd Qu.: 0.050 3rd Qu.: 0.250 3rd Qu.: 10.43
## Max. :5760.75 Max. :427.720 Max. :762.700 Max. :1321.71
## NA's :5 NA's :73
## renewables_share_elec coal_share_elec gas_share_elec oil_share_elec
## Min. : 0.000 Min. : 0.00 Min. : 0.00 Min. : 0.0000
## 1st Qu.: 4.942 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.8562
## Median : 22.355 Median : 0.00 Median : 8.48 Median : 5.0365
## Mean : 34.391 Mean : 14.14 Mean : 23.02 Mean : 23.8598
## 3rd Qu.: 60.269 3rd Qu.: 19.37 3rd Qu.: 37.74 3rd Qu.: 37.6215
## Max. :100.000 Max. :100.00 Max. :100.00 Max. :100.0000
## NA's :5 NA's :5 NA's :27 NA's :5
## electricity_per_capita renewables_per_capita fossil_per_capita
## Min. :0.000e+00 Min. :0.000e+00 Min. :0.000e+00
## 1st Qu.:4.482e-07 1st Qu.:3.516e-08 1st Qu.:1.278e-07
## Median :1.982e-06 Median :2.228e-07 Median :9.225e-07
## Mean :3.805e-06 Mean :1.263e-06 Mean :2.187e-06
## 3rd Qu.:5.117e-06 3rd Qu.:9.264e-07 3rd Qu.:2.987e-06
## Max. :5.603e-05 Max. :5.603e-05 Max. :2.413e-05
##
## electricity_per_gdp gdp_per_electricity log_gdp log_population
## Min. :0.000e+00 Min. :3.037e+08 Min. :19.56 Min. :11.13
## 1st Qu.:1.023e-10 1st Qu.:3.162e+09 1st Qu.:23.88 1st Qu.:15.15
## Median :1.710e-10 Median :5.019e+09 Median :25.07 Median :16.12
## Mean :2.059e-10 Mean :6.044e+09 Mean :25.19 Mean :16.11
## 3rd Qu.:2.520e-10 3rd Qu.:7.150e+09 3rd Qu.:26.58 3rd Qu.:17.19
## Max. :1.996e-09 Max. :3.968e+10 Max. :30.93 Max. :21.08
##
## log_electricity fossil_to_renewable_ratio fossil_share_elec solar_share
## Min. :0.000 Min. : 0.0000 Min. :0.0000 Min. :0.000000
## 1st Qu.:1.413 1st Qu.: 0.2512 1st Qu.:0.1898 1st Qu.:0.000000
## Median :2.795 Median : 1.0400 Median :0.4588 Median :0.000000
## Mean :2.909 Mean : 7.0289 Mean :0.4806 Mean :0.027620
## 3rd Qu.:4.206 3rd Qu.: 5.3100 3rd Qu.:0.8031 3rd Qu.:0.009275
## Max. :9.088 Max. :369.0841 Max. :0.9973 Max. :0.873275
## NA's :5
## wind_share hydro_share high_renewable transitioning
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.06492 1st Qu.:0.0000 1st Qu.:1.0000
## Median :0.00000 Median :0.47789 Median :0.0000 Median :1.0000
## Mean :0.05141 Mean :0.45371 Mean :0.3137 Mean :0.8364
## 3rd Qu.:0.03315 3rd Qu.:0.79907 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :0.79931 Max. :0.99366 Max. :1.0000 Max. :1.0000
## NA's :73 NA's :5 NA's :5
## continent co2 co2_including_luc
## Length:3795 Min. : 0.048 Min. : -7.975
## Class :character 1st Qu.: 4.418 1st Qu.: 10.343
## Mode :character Median : 20.541 Median : 39.296
## Mean : 192.954 Mean : 227.122
## 3rd Qu.: 90.416 3rd Qu.: 114.046
## Max. :11447.913 Max. :11169.235
## NA's :23 NA's :69
## co2_including_luc_per_capita co2_including_luc_per_gdp
## Min. :-2.829 Min. :-0.3210
## 1st Qu.: 1.586 1st Qu.: 0.2210
## Median : 3.925 Median : 0.3520
## Mean : 5.798 Mean : 0.6784
## 3rd Qu.: 7.712 3rd Qu.: 0.6420
## Max. :67.600 Max. :20.7680
## NA's :69 NA's :69
## co2_including_luc_per_unit_energy co2_per_capita co2_per_gdp
## Min. :-0.2050 Min. : 0.0220 Min. :0.0320
## 1st Qu.: 0.1860 1st Qu.: 0.6987 1st Qu.:0.1480
## Median : 0.2450 Median : 2.7300 Median :0.2280
## Mean : 0.8546 Mean : 4.9073 Mean :0.2831
## 3rd Qu.: 0.5050 3rd Qu.: 6.7192 3rd Qu.:0.3420
## Max. :42.7180 Max. :67.5840 Max. :2.1510
## NA's :166 NA's :23 NA's :23
## co2_per_unit_energy coal_co2 coal_co2_per_capita consumption_co2
## Min. :0.0290 Min. : 0.000 Min. : 0.000 Min. : 0.084
## 1st Qu.:0.1640 1st Qu.: 0.304 1st Qu.: 0.026 1st Qu.: 10.204
## Median :0.2050 Median : 3.359 Median : 0.326 Median : 43.264
## Mean :0.2082 Mean : 104.342 Mean : 1.343 Mean : 259.824
## 3rd Qu.:0.2360 3rd Qu.: 27.986 3rd Qu.: 1.907 3rd Qu.: 167.065
## Max. :1.6710 Max. :8168.899 Max. :13.952 Max. :10400.611
## NA's :120 NA's :888 NA's :888 NA's :1068
## consumption_co2_per_capita consumption_co2_per_gdp cumulative_co2
## Min. : 0.024 Min. :0.002 Min. : 1.4
## 1st Qu.: 1.154 1st Qu.:0.193 1st Qu.: 113.0
## Median : 4.108 Median :0.269 Median : 728.1
## Mean : 6.351 Mean :0.302 Mean : 8270.0
## 3rd Qu.: 9.630 3rd Qu.:0.387 3rd Qu.: 3738.3
## Max. :47.559 Max. :1.293 Max. :426941.3
## NA's :1068 NA's :1068 NA's :23
## cumulative_coal_co2 cumulative_gas_co2 cumulative_oil_co2
## Min. : 0.00 Min. : 0.00 Min. : 1.39
## 1st Qu.: 14.95 1st Qu.: 34.58 1st Qu.: 80.79
## Median : 181.83 Median : 219.74 Median : 334.06
## Mean : 5243.99 Mean : 1617.69 Mean : 2771.53
## 3rd Qu.: 1590.84 3rd Qu.: 1011.18 3rd Qu.: 1543.16
## Max. :195480.73 Max. :80314.20 Max. :163519.67
## NA's :888 NA's :1149 NA's :23
## cumulative_other_co2 energy_per_capita energy_per_gdp gas_co2
## Min. : 0.017 Min. : 105.1 Min. : 0.0780 Min. : 0.000
## 1st Qu.: 9.877 1st Qu.: 3416.6 1st Qu.: 0.8375 1st Qu.: 1.799
## Median : 26.866 Median : 13904.6 Median : 1.1740 Median : 10.374
## Mean : 152.141 Mean : 26824.9 Mean : 1.4287 Mean : 54.764
## 3rd Qu.: 83.343 3rd Qu.: 35500.2 3rd Qu.: 1.7995 3rd Qu.: 50.954
## Max. :5071.090 Max. :263541.6 Max. :10.1410 Max. :1748.499
## NA's :2760 NA's :120 NA's :120 NA's :1149
## gas_co2_per_capita ghg_excluding_lucf_per_capita ghg_per_capita
## Min. : 0.0000 Min. : 0.225 Min. : 0.552
## 1st Qu.: 0.1430 1st Qu.: 1.192 1st Qu.: 3.058
## Median : 0.8525 Median : 3.652 Median : 5.980
## Mean : 2.2055 Mean : 6.660 Mean : 8.599
## 3rd Qu.: 2.1150 3rd Qu.: 8.338 3rd Qu.: 10.336
## Max. :42.8220 Max. :120.443 Max. :120.802
## NA's :1149 NA's :69 NA's :46
## land_use_change_co2 land_use_change_co2_per_capita methane
## Min. :-286.0790 Min. :-6.6750 Min. : 0.052
## 1st Qu.: -0.1952 1st Qu.:-0.0500 1st Qu.: 5.203
## Median : 2.1440 Median : 0.2255 Median : 15.968
## Mean : 33.6902 Mean : 0.9370 Mean : 57.109
## 3rd Qu.: 18.3550 3rd Qu.: 1.3215 3rd Qu.: 46.815
## Max. :2805.2370 Max. :27.9400 Max. :1864.384
## NA's :69 NA's :69 NA's :46
## methane_per_capita nitrous_oxide nitrous_oxide_per_capita
## Min. : 0.244 Min. : 0.010 Min. :0.0280
## 1st Qu.: 0.826 1st Qu.: 1.377 1st Qu.:0.2260
## Median : 1.159 Median : 4.563 Median :0.3390
## Mean : 2.257 Mean : 16.250 Mean :0.5374
## 3rd Qu.: 1.924 3rd Qu.: 12.165 3rd Qu.:0.5670
## Max. :57.888 Max. :457.125 Max. :5.3980
## NA's :46 NA's :46 NA's :46
## oil_co2 oil_co2_per_capita other_co2_per_capita other_industry_co2
## Min. : 0.048 Min. : 0.013 Min. :0.0010 Min. : 0.0000
## 1st Qu.: 2.589 1st Qu.: 0.347 1st Qu.:0.0535 1st Qu.: 0.4575
## Median : 9.248 Median : 1.220 Median :0.0770 Median : 1.1580
## Mean : 62.562 Mean : 2.045 Mean :0.0921 Mean : 5.9598
## 3rd Qu.: 35.446 3rd Qu.: 2.769 3rd Qu.:0.1250 3rd Qu.: 4.0020
## Max. :2642.556 Max. :22.950 Max. :0.3020 Max. :177.2570
## NA's :23 NA's :23 NA's :2760 NA's :2760
## primary_energy_consumption share_global_cement_co2 share_global_co2
## Min. : 0.27 Min. : 0.0000 Min. : 0.0000
## 1st Qu.: 28.65 1st Qu.: 0.0150 1st Qu.: 0.0140
## Median : 100.87 Median : 0.0610 Median : 0.0620
## Mean : 881.92 Mean : 0.6235 Mean : 0.5895
## 3rd Qu.: 485.16 3rd Qu.: 0.2400 3rd Qu.: 0.2670
## Max. :44516.31 Max. :52.1450 Max. :31.0470
## NA's :120 NA's :110 NA's :23
## share_global_co2_including_luc share_global_coal_co2
## Min. :-0.0240 Min. : 0.0000
## 1st Qu.: 0.0270 1st Qu.: 0.0020
## Median : 0.1040 Median : 0.0240
## Mean : 0.6051 Mean : 0.7908
## 3rd Qu.: 0.3140 3rd Qu.: 0.2080
## Max. :27.6250 Max. :53.8270
## NA's :69 NA's :888
## share_global_cumulative_cement_co2 share_global_cumulative_co2
## Min. : 0.0000 Min. : 0.0000
## 1st Qu.: 0.0110 1st Qu.: 0.0090
## Median : 0.0760 Median : 0.0530
## Mean : 0.6237 Mean : 0.5949
## 3rd Qu.: 0.3960 3rd Qu.: 0.2582
## Max. :34.9010 Max. :29.0640
## NA's :110 NA's :23
## share_global_cumulative_co2_including_luc share_global_cumulative_coal_co2
## Min. : 0.0000 Min. : 0.0000
## 1st Qu.: 0.0330 1st Qu.: 0.0020
## Median : 0.1140 Median : 0.0270
## Mean : 0.6361 Mean : 0.7908
## 3rd Qu.: 0.3100 3rd Qu.: 0.2420
## Max. :24.5990 Max. :26.4540
## NA's :69 NA's :888
## share_global_cumulative_gas_co2 share_global_cumulative_luc_co2
## Min. : 0.0000 Min. :-0.5580
## 1st Qu.: 0.0200 1st Qu.: 0.0230
## Median : 0.1170 Median : 0.1300
## Mean : 0.8685 Mean : 0.7841
## 3rd Qu.: 0.5258 3rd Qu.: 0.4908
## Max. :40.8490 Max. :17.8810
## NA's :1149 NA's :69
## share_global_cumulative_oil_co2 share_global_cumulative_other_co2
## Min. : 0.0000 Min. : 0.000
## 1st Qu.: 0.0160 1st Qu.: 0.176
## Median : 0.0690 Median : 0.377
## Mean : 0.5681 Mean : 2.222
## 3rd Qu.: 0.3172 3rd Qu.: 1.230
## Max. :30.4600 Max. :50.697
## NA's :23 NA's :2760
## share_global_gas_co2 share_global_luc_co2 share_global_oil_co2
## Min. : 0.0000 Min. :-8.0790 Min. : 0.0000
## 1st Qu.: 0.0280 1st Qu.:-0.0040 1st Qu.: 0.0230
## Median : 0.1660 Median : 0.0460 Median : 0.0810
## Mean : 0.8684 Mean : 0.6981 Mean : 0.5523
## 3rd Qu.: 0.8222 3rd Qu.: 0.3857 3rd Qu.: 0.3210
## Max. :26.3460 Max. :46.1720 Max. :24.2220
## NA's :1149 NA's :69 NA's :23
## share_global_other_co2 share_of_temperature_change_from_ghg
## Min. : 0.000 Min. : 0.0000
## 1st Qu.: 0.172 1st Qu.: 0.0550
## Median : 0.434 Median : 0.1500
## Mean : 2.222 Mean : 0.6048
## 3rd Qu.: 1.543 3rd Qu.: 0.3920
## Max. :58.136 Max. :19.9580
## NA's :2760 NA's :23
## temperature_change_from_ch4 temperature_change_from_co2
## Min. :-0.00100 Min. :0.000000
## 1st Qu.: 0.00000 1st Qu.:0.000000
## Median : 0.00100 Median :0.001000
## Mean : 0.00217 Mean :0.005752
## 3rd Qu.: 0.00200 3rd Qu.:0.003000
## Max. : 0.06300 Max. :0.239000
## NA's :46 NA's :23
## temperature_change_from_ghg temperature_change_from_n2o total_ghg
## Min. :0.000000 Min. :0.00000 Min. : 0.089
## 1st Qu.:0.001000 1st Qu.:0.00000 1st Qu.: 21.915
## Median :0.002000 Median :0.00000 Median : 60.260
## Mean :0.008391 Mean :0.00033 Mean : 296.246
## 3rd Qu.:0.005000 3rd Qu.:0.00000 3rd Qu.: 199.122
## Max. :0.285000 Max. :0.01100 Max. :13427.619
## NA's :23 NA's :46 NA's :46
## total_ghg_excluding_lucf trade_co2 trade_co2_share
## Min. : 0.099 Min. :-1532.0800 Min. :-98.849
## 1st Qu.: 8.705 1st Qu.: -1.3195 1st Qu.: -4.922
## Median : 33.772 Median : 2.3090 Median : 12.467
## Mean : 235.324 Mean : 0.0553 Mean : 26.469
## 3rd Qu.: 121.294 3rd Qu.: 9.7400 3rd Qu.: 38.595
## Max. :13012.948 Max. : 654.1420 Max. :568.635
## NA's :69 NA's :1068 NA's :1068
## co2_pct_change fossil_fuel_co2 log_co2 co2_per_kwh
## Min. :-55.097 Min. : 0.689 Min. :-3.037 Min. :0.06741
## 1st Qu.: -2.441 1st Qu.: 17.125 1st Qu.: 1.486 1st Qu.:1.13940
## Median : 1.828 Median : 55.835 Median : 3.022 Median :1.47016
## Mean : 2.817 Mean : 290.352 Mean : 2.982 Mean : Inf
## 3rd Qu.: 6.893 3rd Qu.: 209.087 3rd Qu.: 4.504 3rd Qu.:2.07686
## Max. :126.935 Max. :10427.785 Max. : 9.346 Max. : Inf
## NA's :187 NA's :1477 NA's :23 NA's :23
ggplot(energy_co2_merged %>% filter(year==2022), aes(x = population, y = co2, size = gdp, label = country)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_text_repel(max.overlaps = 5, size = 5) +
scale_x_log10(labels = scales::comma) +
scale_y_log10(labels = scales::comma) +
labs(
title = "Fossil Fuel CO2 Emissions vs Population (Bubble Size = GDP)",
x = "Population",
y = "CO2 Emissions (log scale)",
size = "GDP"
) +
theme_minimal()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 155 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
ggplot(energy_co2_merged %>% filter(country %in% c("United States","India","China","Germany","Brazil")),aes(x=year,y=co2,color = country))+
geom_line(size=1)+
labs(title = "CO2 Emisions Over Time for Global Super Powers ",y="CO2 (Million Tonnes)")+
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
co2_change <- energy_co2_merged %>%
filter(year %in% c(2015, 2022)) %>%
select(country, year, co2,iso_code) %>%
pivot_wider(names_from = year, values_from = co2, names_prefix = "co2_") %>%
mutate(co2_diff_2015_2022 = co2_2022 - co2_2015)
co2_change %>%
drop_na(co2_diff_2015_2022,iso_code) %>%
arrange(co2_diff_2015_2022) %>%
slice(1:15) %>%
ggplot(aes(x = reorder(country, co2_diff_2015_2022,decreasing = TRUE), y = co2_diff_2015_2022)) +
geom_col(fill = "steelblue") +
coord_flip() +
labs(
title = "Countries with Most Decrease in CO2 Emissions (2015–2022)",
x = "Country",
y = "Change in CO2 Emissions (Mt)"
) +
theme_minimal()
ggplot(energy_co2_merged, aes(x = continent, y = electricity_generation)) +
geom_boxplot(fill = "steelblue") +
theme_minimal() +
labs(title = "Electricity by Continent")
ggplot(energy_co2_merged, aes(x = continent, y = electricity_per_capita)) +
geom_boxplot(fill = "steelblue") +
theme_minimal() +
labs(title = "Electricity per Capita by Continent")
energy_co2_merged %>%
group_by(continent, year) %>%
summarise(
coal = sum(coal_share_elec, na.rm = TRUE),
gas = sum(gas_share_elec, na.rm = TRUE),
oil = sum(oil_share_elec, na.rm = TRUE)
) %>%
pivot_longer(cols = c("coal", "gas","oil"), names_to = "source", values_to = "electricity") %>%
ggplot(aes(x = year, y = electricity, color = source)) +
geom_line(size = 1) +
facet_wrap(~continent, scales = "free_y") +
theme_minimal() +
labs(title = "Growth of Coal, Gas, Oil by Continent", y = "Electricity (TWh)")
## `summarise()` has grouped output by 'continent'. You can override using the
## `.groups` argument.
energy_co2_merged %>%
group_by(continent, year) %>%
summarise(
wind = sum(wind_share, na.rm = TRUE),
solar = sum(solar_share, na.rm = TRUE),
hydro = sum(hydro_share, na.rm = TRUE)
) %>%
pivot_longer(cols = c("wind", "solar","hydro"), names_to = "source", values_to = "electricity") %>%
ggplot(aes(x = year, y = electricity, color = source)) +
geom_line(size = 1) +
facet_wrap(~continent, scales = "free_y") +
theme_minimal() +
labs(title = "Growth of Wind, Solar, Hydro by Continent", y = "Electricity (TWh)")
## `summarise()` has grouped output by 'continent'. You can override using the
## `.groups` argument.
energy_by_continent <- energy_transformed %>%
group_by(continent, year) %>%
summarise(
renewables = sum(renewables_electricity, na.rm = TRUE),
fossil = sum(fossil_electricity, na.rm = TRUE)
) %>%
pivot_longer(cols = c("renewables", "fossil"), names_to = "source", values_to = "electricity") %>%
ungroup()
## `summarise()` has grouped output by 'continent'. You can override using the
## `.groups` argument.
ggplot(energy_by_continent, aes(x = year, y = electricity, color = source)) +
geom_line(size = 1) +
facet_wrap(~continent, scales = "free_y") +
theme_minimal() +
labs(title = "Renewable vs Non-Renewable Electricity Generation by Continent Over Time")
ggplot(energy_co2_merged %>% filter(year==2022), aes(x = log_gdp, y = electricity_generation, size = fossil_to_renewable_ratio, label = country)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_text_repel(max.overlaps = 10, size = 5) +
scale_x_log10(labels = scales::comma) +
scale_y_log10(labels = scales::comma) +
theme_minimal()
## Warning: ggrepel: 155 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
energy_co2_merged %>%
group_by(year) %>%
summarise(
fossil = mean(fossil_electricity, na.rm = TRUE),
co2 = mean(co2, na.rm = TRUE),
renewable = mean(renewables_electricity, na.rm = TRUE)
) %>%
pivot_longer(cols = c("fossil", "co2", "renewable"), names_to = "source", values_to = "share") %>%
ggplot(aes(x = year, y = share, color = source)) +
geom_line(size = 1.2) +
theme_minimal() +
labs(
title = "Global Trend of Electricity Generation and co2 over time",
x = "Year",
y = "Share of Electricity (%)",
color = "Source"
)
library(tidyverse)
energy_co2_merged %>%
group_by(continent, year) %>%
summarise(
co2 = mean(co2, na.rm = TRUE),
fossil = mean(fossil_electricity, na.rm = TRUE),
renew = mean(renewables_share_elec, na.rm = TRUE),
.groups = 'drop'
) %>%
pivot_longer(cols = c(co2, fossil, renew), names_to = "variable", values_to = "value") %>%
ggplot(aes(x = year, y = value, color = variable)) +
geom_line(size = 1) +
facet_wrap(~continent, scales = "free_y") +
theme_minimal() +
labs(
title = "CO2 Emissions, Fossil and Renewable Electricity Trends by Continent",
x = "Year",
y = "Electricity Share/Emissions",
color = "Metric"
)
energy_co2_merged %>%
group_by(continent, year) %>%
summarise(
fossil_elec = mean(fossil_electricity, na.rm = TRUE),
co2 = mean(co2, na.rm = TRUE),
.groups = "drop"
) %>%
ggplot(aes(x = fossil_elec, y = co2)) +
geom_point(alpha = 0.6, color = "firebrick") +
geom_smooth(method = "lm", color = "black") +
facet_wrap(~continent, scales = "free") +
labs(
title = "Fossil Electricity vs CO2 Emissions by Continent",
x = "Fossil Electricity (TWh)",
y = "CO2 Emissions (Mt)"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
energy_co2_merged %>%
mutate(co2_per_twh = co2 /electricity_generation) %>%
group_by(continent, year) %>%
summarise(
co2_efficiency = mean(co2_per_twh, na.rm = TRUE),
.groups = "drop"
) %>%
ggplot(aes(x = year, y = co2_efficiency, color = continent)) +
geom_line(size = 1.2) +
labs(
title = "COâ‚‚ Emissions per Unit of Electricity Generated",
y = "COâ‚‚ / TWh",
x = "Year"
) +
theme_minimal()
energy_co2_merged %>%
mutate(ghg_per_twh = total_ghg /electricity_generation) %>%
group_by(continent, year) %>%
summarise(
ghg_efficiency = mean(ghg_per_twh, na.rm = TRUE),
.groups = "drop"
) %>%
ggplot(aes(x = year, y = ghg_efficiency, color = continent)) +
geom_line(size = 1.2) +
labs(
title = "GHG Emissions per Unit of Electricity Generated",
y = "GHG / TWh",
x = "Year"
) +
theme_minimal()
# 1. Filter for United States
us_data <- energy_co2_merged %>%
filter(country == "United States") %>%
select(-country, -iso_code, -continent, year) # Remove non-numeric/grouping vars
# 2. Remove columns with all NA or zero variance
us_data <- us_data %>%
select(where(is.numeric)) %>%
select(where(~ sum(!is.na(.)) > 0)) %>%
select(where(~ sd(., na.rm = TRUE) > 0))
# 3. Build models and extract p-values
results <- map_dfr(
setdiff(names(us_data), "co2"),
function(var) {
df <- us_data %>% select(co2, !!sym(var)) %>% drop_na()
if (nrow(df) < 10) return(NULL) # skip if not enough data
model <- lm(co2 ~ ., data = df)
tidy(model) %>%
filter(term != "(Intercept)") %>%
mutate(variable = var)
}
)
# 4. Print significant predictors (p >= 0.05)
significant <- results %>%
filter(p.value < 0.05) %>%
arrange(p.value)
head(significant %>% arrange(desc(estimate)))
library(ggrepel)
ggplot(energy_co2_merged %>% filter(year==2022), aes(x = renewables_per_capita, y = co2, size = population, label = country)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_text_repel(max.overlaps = 5, size = 4) +
scale_x_log10() +
scale_y_log10(labels = scales::comma) +
labs(
title = "Renewable Elec vs Co2 Emissions (Bubble Size = Population)",
x = "Renewables_per_capita",
y = "CO2 Emissions (log scale)",
size = "Population"
) +
theme_minimal()
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## log-10 transformation introduced infinite values.
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 134 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# Global Scatter Plot: CO2 per Capita vs Renewables Share
energy_co2_merged <- energy_co2_merged %>%
mutate(co2_per_capita = co2 / population)
ggplot(energy_co2_merged, aes(x = renewables_share_elec, y = log(co2_per_capita))) +
geom_point(alpha = 0.4, color = "darkgreen") +
geom_smooth(method = "lm", color = "red") +
labs(
title = "Relationship between Renewable Electricity Share and CO2 Emissions per Capita",
x = "Renewables Share (%)",
y = "Log CO2 Emissions per Capita (tons)"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 28 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 28 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Global Regression Model
model_global <- lm(co2_per_capita ~ renewables_share_elec + gdp, data = energy_co2_merged)
summary(model_global)
##
## Call:
## lm(formula = co2_per_capita ~ renewables_share_elec + gdp, data = energy_co2_merged)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.362e-06 -3.485e-06 -8.430e-07 1.318e-06 6.067e-05
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.811e-06 1.471e-07 46.314 <2e-16 ***
## renewables_share_elec -6.285e-08 2.958e-09 -21.248 <2e-16 ***
## gdp 4.601e-19 5.126e-20 8.976 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.028e-06 on 3764 degrees of freedom
## (28 observations deleted due to missingness)
## Multiple R-squared: 0.1335, Adjusted R-squared: 0.133
## F-statistic: 289.9 on 2 and 3764 DF, p-value: < 2.2e-16
top5 <- c("Germany", "India", "Brazil", "United States", "China")
energy_co2_merged %>%
filter(country %in% top5) %>%
ggplot(aes(x = year, y = co2 / population, color = country)) +
geom_line(size = 1) +
labs(
title = "CO2 Emissions per Capita Over Time (Top 5 Economies)",
y = "CO2 per Capita (tons)",
x = "Year"
) +
theme_minimal()
top5 <- c("United States", "China", "India", "Germany", "Brazil")
energy_co2_merged %>%
filter(country %in% top5) %>%
ggplot(aes(x = year, y = renewables_share_elec, color = country)) +
geom_line(size = 1) +
labs(
title = "Renewable Electricity Share Over Time (Top 5 Economies)",
y = "Renewables Share (%)",
x = "Year"
) +
theme_minimal()
energy_co2_merged %>%
group_by(year) %>%
summarise(avg_co2_per_kwh = mean(co2 / electricity_generation, na.rm = TRUE)) %>%
ggplot(aes(x = year, y = avg_co2_per_kwh)) +
geom_line(color = "firebrick", size = 1.2) +
labs(
title = "Global Average CO2 Emissions per kWh Over Time",
x = "Year",
y = "CO2 per kWh"
) +
theme_minimal()
energy_co2_merged %>%
filter(year == max(year)) %>%
mutate(co2_per_kwh = co2 / electricity_generation) %>%
ggplot(aes(x = renewables_share_elec, y = co2_per_kwh)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(
title = "Renewables Share vs COâ‚‚ per kWh (Latest Year)",
x = "Renewables Share (%)",
y = "COâ‚‚ per kWh"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
top5 <- c("United States", "China", "India", "Germany", "Brazil")
energy_co2_merged %>%
filter(country %in% top5) %>%
mutate(co2_per_kwh = co2 / electricity_generation) %>%
filter(!is.na(co2_per_kwh), co2_per_kwh < 5) %>% # Optional: filter extreme values
ggplot(aes(x = year, y = co2_per_kwh, color = country)) +
geom_line(size = 1.2) +
labs(
title = "CO2 Emissions per kWh Over Time (Top 5 Economies)",
x = "Year",
y = expression(CO[2]~"per kWh"),
color = "Country"
) +
theme_minimal()
library(ggrepel)
ggplot(energy_co2_merged %>% filter(year==2022), aes(x =renewables_share_elec , y = co2_per_kwh, size = population, label = country)) +
geom_point(alpha = 0.6, color = "steelblue") +
geom_text_repel(max.overlaps = 2, size = 4) +
scale_x_log10() +
scale_y_log10(labels = scales::comma) +
labs(
title = "Renewable Elec vs Co2 Emissions (Bubble Size = Population)",
x = "Renewables_per_capita",
y = "CO2 Emissions (log scale)",
size = "Population"
) +
theme_minimal()
## Warning in scale_x_log10(): log-10 transformation introduced infinite values.
## log-10 transformation introduced infinite values.
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_text_repel()`).
## Warning: ggrepel: 148 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps